# Import pandas
import pandas as pd
user_usage = pd.read_csv("user_uses.csv")
user_device = pd.read_csv("user_device.csv")
devices = pd.read_csv("supported_devices.csv")
devices.rename(columns={"Retail Branding": "manufacturer"}, inplace=True)
user_device.head()
devices.head(5)
We're trying to get the average usage for different types of devices. So we need to get the user's device code from user_usage as a column on user_usage, and then get the device's manufacturer from devices as a column on the result.
First, we merge user_usage with user_device with "use_id" as our common column
result = pd.merge(user_usage,
user_device[['use_id', 'platform', 'device']],
on='use_id')
result.head()
A left merge, or left join, between two dataframes keeps all the records from the left dataframe, in this case "user_usage". Rows from the right dataframe will be kept in the result only where there is a match in the merge variable in the right dataframe, and NaN values will be in the result where not.
result_left = pd.merge(user_usage,
user_device[['use_id', 'platform', 'device']],
on='use_id', how='left')
print("user_usage dimensions: {}".format(user_usage.shape))
print("result dimensions: {}".format(result.shape))
print("There are {} missing values in the result.".format(
result['device'].isnull().sum()))
result_left.head()
A right merge, or right join, between two dataframes keeps all of the records from the right dataframe, in this case "user_device". Rows from the left dataframe will be kept where there is a match in the merge variable, and NaN values will be in the result where it's not.
result_right = pd.merge(user_usage,
user_device[['use_id', 'platform', 'device']],
on='use_id', how='right')
print("user_device dimensions: {}".format(user_device.shape))
print("result dimensions: {}".format(result.shape))
print("There are {} missing values in the 'monthly_mb' column in the result.".format(
result['monthly_mb'].isnull().sum()))
print("There are {} missing values in the 'platform' column in the result.".format(
result['platform'].isnull().sum()))
result_right.head()
A full outer join, or outer merge, keeps all rows from the left and right dataframe in the result. Rows will be aligned where there is shared join values between the left and right. Rows with NaN values, in either the left-originating or right-originating columns will be. it will be left in the result where there is no shared join value.
print("There are {} unique values of use_id in our dataframes.".format(
pd.concat([user_usage['use_id'], user_device['use_id']]).unique().shape[0]))
result_outer = pd.merge(user_usage,
user_device[['use_id', 'platform', 'device']],
on='use_id', how='outer', indicator=True)
print("Outer merge result has {} rows.".format(result.shape))
print("There are {} rows with no missing values.".format(
(result_outer.apply(lambda x: x.isnull().sum(), axis=1) == 0).sum()))
result_outer.head()
# First merge user_usage and user_device to add the platform and device to the user usage.
result = pd.merge(user_usage,
user_device[['use_id', 'platform', 'device']],
on='use_id',
how='left')
# Now, based on the "device" column in result, match the "Model" column in devices.
devices.rename(columns={"Retail Branding": "manufacturer"}, inplace=True)
result = pd.merge(result,
devices[['manufacturer', 'Model']],
left_on='device',
right_on='Model',
how='left')
result.sample(5)
# You can apply filter on dataframe according to your reuirement.
devices[devices.Device.str.startswith('GT')]